Data Set Link: https://www.kaggle.com/justinas/startup-investments
Project 1
import warnings
warnings.filterwarnings('ignore')
import pandas as pd #importing the pandas module to load our data
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns #importing seaborn to create boxplots
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import pyproj
from shapely.geometry import Point
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as sm
%matplotlib inline
# activate plot theme
import qeds
qeds.themes.mpl_style();
# Load the low-resolution world basemap bundled with geopandas; used later as
# a backdrop for the US maps. `world.columns` just displays the schema.
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
world.columns
Index(['pop_est', 'continent', 'name', 'iso_a3', 'gdp_md_est', 'geometry'], dtype='object')
# Load the Crunchbase "objects" table; latin-1 handles the non-UTF-8
# characters present in the dump.
startup_objects = pd.read_csv("objects.csv", encoding = 'latin-1') #uploading the csv file into a variable startup_objects
# Note the file is too large to be displayed entirely
# Since we are focusing on US data, keep only rows whose country_code is USA
# and store the result in a new variable called df.
df = startup_objects.drop(index =startup_objects[startup_objects['country_code'] != 'USA'].index)
# Drop rows missing the category or state: both are required for the analysis.
df.dropna(subset = ['category_code','state_code'], inplace = True)
pd.set_option('display.max_columns', None )
# BUG FIX: the original filter was `df['closed_at'] == None`, which is always
# False in pandas (missing values are NaN and `== None` never matches), so no
# rows were ever dropped. Use notna() to actually exclude companies that have
# a closing date, i.e. companies that have shut down.
df2 = df.drop(index = df[df['closed_at'].notna()].index)
Since the focus of our study is on companies that are still operating, we want to drop any rows that represent companies that have closed (i.e. that have a closing date recorded).
df.info() #Checking that the empty rows have been dropped and inspecting column dtypes.
<class 'pandas.core.frame.DataFrame'> Int64Index: 46419 entries, 0 to 196552 Data columns (total 40 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 46419 non-null object 1 entity_type 46419 non-null object 2 entity_id 46419 non-null int64 3 parent_id 0 non-null object 4 name 46419 non-null object 5 normalized_name 46419 non-null object 6 permalink 46419 non-null object 7 category_code 46419 non-null object 8 status 46419 non-null object 9 founded_at 36437 non-null object 10 closed_at 1311 non-null object 11 domain 45241 non-null object 12 homepage_url 45241 non-null object 13 twitter_username 29918 non-null object 14 logo_url 41969 non-null object 15 logo_width 46419 non-null int64 16 logo_height 46419 non-null int64 17 short_description 4639 non-null object 18 description 32282 non-null object 19 overview 45579 non-null object 20 tag_list 27857 non-null object 21 country_code 46419 non-null object 22 state_code 46419 non-null object 23 city 45597 non-null object 24 region 46419 non-null object 25 first_investment_at 595 non-null object 26 last_investment_at 595 non-null object 27 investment_rounds 46419 non-null int64 28 invested_companies 46419 non-null int64 29 first_funding_at 18777 non-null object 30 last_funding_at 18777 non-null object 31 funding_rounds 46419 non-null int64 32 funding_total_usd 46419 non-null int64 33 first_milestone_at 24142 non-null object 34 last_milestone_at 24142 non-null object 35 milestones 46419 non-null int64 36 relationships 46419 non-null int64 37 created_by 34320 non-null object 38 created_at 46419 non-null object 39 updated_at 46419 non-null object dtypes: int64(9), object(31) memory usage: 15.5+ MB
df['funding_total_usd'].describe() # Summary statistics for our outcome (Y) variable: total funding in USD.
count 4.641900e+04 mean 6.531190e+06 std 4.930020e+07 min 0.000000e+00 25% 0.000000e+00 50% 0.000000e+00 75% 1.254797e+06 max 5.700000e+09 Name: funding_total_usd, dtype: float64
df['category_code'].describe() # Summary statistics for our X variable category_code (42 industry categories).
count 46419 unique 42 top software freq 7550 Name: category_code, dtype: object
df['state_code'].describe() # Summary statistics for our X variable state_code (51 US states/DC).
count 46419 unique 51 top CA freq 15238 Name: state_code, dtype: object
# Histogram of total funding. The distribution is extremely right-skewed, so
# the count axis is log-scaled.
plt.hist(df.funding_total_usd, bins = 5, color= 'lime', edgecolor = 'black', log = True) #Plotting the histogram of Y
# FIX: the original title claimed "< 500M" but no range restriction was
# applied; label the plot for what it actually shows (the full distribution).
plt.title('Funding Distribution of Startups (US)')
plt.xlabel('Total Funding (USD)')
plt.ylabel('Log of No of Startups')
plt.rcParams['figure.figsize'] = [20, 10]
plt.ticklabel_format(style = 'plain', axis = 'x') #changing scientific notation to plain format.

# Same histogram restricted to the 0-1B USD range.
plt.hist(df.funding_total_usd, bins = 5, range = (0, 1000000000), color= 'orange', edgecolor = 'black', log = True) #Plotting the histogram of Y
# FIX: the range is (0, 1e9), i.e. funding < 1B, not "< 100M" as the original
# title claimed.
plt.title('Funding Distribution of Startups with Funding < 1B (US)')
plt.xlabel('Total Funding (USD)')
plt.ylabel('Log of No of Startups')
plt.rcParams['figure.figsize'] = [20, 10]
plt.ticklabel_format(style = 'plain', axis = 'x') #changing scientific notation to plain format.

# Zoom into the 1M-200M band, where most funded startups sit.
plt.hist(df.funding_total_usd, bins = 10, range = (1000000, 200000000), color= 'yellow', edgecolor = 'black', log = True) #Plotting the histogram of Y
plt.title('Funding Distribution of Startups with funding < 200M (US)')
plt.xlabel('Total Funding (USD)')
plt.ylabel('Log of No of Startups')
plt.rcParams['figure.figsize'] = [20, 10]
plt.ticklabel_format(style = 'plain', axis = 'x') #changing scientific notation to plain format.
# Bar chart of the number of startups per state (our X variable state_code),
# ordered from most to fewest startups by value_counts().
df['state_code'].value_counts().plot(kind='bar',
figsize=(30,10),
title="Number of Startups per State", color= 'tomato', xlabel = 'State',
ylabel = 'No of Startups',
fontsize ='large')
#Plotting the count of our X variable(state_code).
<AxesSubplot:title={'center':'Number of Startups per State'}, xlabel='State', ylabel='No of Startups'>
# Bar chart of the number of startups per industry category, ordered from
# most to fewest by value_counts().
df['category_code'].value_counts().plot(kind='bar',
figsize=(25,10),
title="Startups per Industry", color= 'dodgerblue', xlabel = 'Industry',
ylabel = 'No of Startups',
fontsize ='large')
<AxesSubplot:title={'center':'Startups per Industry'}, xlabel='Industry', ylabel='No of Startups'>
gb_location = df.groupby(['state_code']) # creating a groupby object for the states.
# Display median/mean/count of funding per state (notebook display only), then
# store the same aggregate table for later merging with the map geometries.
gb_location['funding_total_usd'].agg(['median', 'mean', 'count'])
gb_loc = gb_location['funding_total_usd'].agg(['median', 'mean', 'count'])
gb_sector = df.groupby(['category_code']) #creating a groupby object for the category.
# Same aggregates, but per industry category.
gb_sec = gb_sector['funding_total_usd'].agg(['median', 'mean', 'count'])
def iszero(x) -> int:
    """Return the number of zero entries in the Series ``x``.

    Used with groupby.apply to count startups that received no funding.
    FIX: the original annotated the return type as ``bool``, but
    ``(x == 0).sum()`` yields an integer count, so the annotation is
    corrected to ``int``.
    """
    return (x == 0).sum()
# Number of zero-funding startups per state.
zerofunding_state = gb_location['funding_total_usd'].apply(iszero)
# Total number of startups per state.
no_bylocation = df['state_code'].value_counts()
df_zerofunding = pd.concat([no_bylocation, zerofunding_state], axis = 'columns', sort = False)
#creating a new dataframe for our zero values in each state
# NOTE(review): the rename assumes value_counts() produces a column named
# 'state_code'; newer pandas versions name it 'count' -- confirm pandas version.
df_zerofunding.rename(columns = {'state_code': 'Num_Startups', 'funding_total_usd': 'Num_ZeroFunding'}, inplace = True)
# Percentage of each state's startups that received no funding at all.
df_zerofunding['Pct_rec_zerofunding']= (df_zerofunding['Num_ZeroFunding']/df_zerofunding['Num_Startups'])*100
bar_zero = df_zerofunding['Pct_rec_zerofunding'].plot(kind = 'bar', color = 'green', width = 0.4, rot =0,
figsize = (50,20), title = "Percentage Startups W No Funding",
xlabel = 'States', ylabel = 'Percentage(%)', fontsize = 35)
# Enlarge title and axis labels to suit the oversized (50x20) figure.
bar_zero.title.set_size(40)
bar_zero.xaxis.label.set_size(40)
bar_zero.yaxis.label.set_size(40)
# Total number of startups per industry category.
no_bysector = df['category_code'].value_counts()
# Number of zero-funding startups per industry category.
zerofunding_sector = gb_sector['funding_total_usd'].apply(iszero)
df_zerofundingsec = pd.concat([no_bysector, zerofunding_sector], axis = 'columns', sort = False)
# creating a dataframe for our zero values in each industry/category.
# NOTE(review): as with the state version above, the rename assumes
# value_counts() yields a column named 'category_code' -- confirm pandas version.
df_zerofundingsec.rename(columns ={'category_code':'Num_Startup_bysector', 'funding_total_usd': 'Num_zerofunding'},
inplace =True)
# Percentage of each sector's startups that received no funding at all.
df_zerofundingsec['Pct_rec_zerofunding']= (df_zerofundingsec['Num_zerofunding']/df_zerofundingsec['Num_Startup_bysector'])*100
bar_zero_sector = df_zerofundingsec['Pct_rec_zerofunding'].plot(kind = 'bar', color = 'indigo', width = 0.4,
figsize = (50,20), title = "Percentage Startups W No Funding per Sector",
xlabel = 'Sector', ylabel = 'Percentage(%)', fontsize = 35)
# Enlarge title and axis labels to suit the oversized (50x20) figure.
bar_zero_sector.title.set_size(40)
bar_zero_sector.xaxis.label.set_size(40)
bar_zero_sector.yaxis.label.set_size(40)
# Seaborn styling: white background, larger fonts for readability.
sns.set(rc = {'figure.figsize': (10,5)}, style = 'white', font_scale = 2)
fig, ax = plt.subplots(figsize=(20,20))
# Funding distribution per industry category; outliers are hidden
# (showfliers=False) so the boxes stay readable despite the heavy right skew.
g = sns.boxplot(y= 'category_code', x= 'funding_total_usd', data = df, ax =ax, showfliers = False,
whis = 2.0, linewidth= 2.5).set(title = 'Funding Distribution Relative to Category')
#g.set_xticklabels(g.get_xticklabels(), rotation = 90)
fig, ax = plt.subplots(figsize=(20,30))
# Same plot, grouped by state instead of industry.
g = sns.boxplot(y= 'state_code', x= 'funding_total_usd', data = df, ax =ax, showfliers = False, palette = 'bright',
whis = 2.0, linewidth= 2.5).set(title ='Funding Distribution relative to State')
As mentioned earlier, the purpose of this study is to see what effect, if any, a startup's location and sector have on its success, which we are measuring as the funding received.
We are going to run a linear regression model with y as the outcome variable and the location as the dummy variable to further understand the effect of the location on the startup funding.
# Build one 0/1 dummy column per state, then extract one (n, 1) design matrix
# per state (X1..X51) for sklearn's LinearRegression, which requires 2-D X.
# The states are in alphabetical order of their two-letter code (AK .. WY).
dummies= pd.get_dummies(df2['state_code'])
merged = pd.concat([df2['funding_total_usd'], dummies],axis ='columns')
X1= merged.AK.values.reshape(-1, 1)
X2= merged.AL.values.reshape(-1, 1)
X3= merged.AR.values.reshape(-1, 1)
X4 = merged.AZ.values.reshape(-1, 1)
X5= merged.CA.values.reshape(-1, 1)
X6 = merged.CO.values.reshape(-1, 1)
X7= merged.CT.values.reshape(-1, 1)
X8 = merged.DC.values.reshape(-1, 1)
X9 = merged.DE.values.reshape(-1, 1)
X10 = merged.FL.values.reshape(-1, 1)
X11 = merged.GA.values.reshape(-1, 1)
X12 = merged.HI.values.reshape(-1, 1)
X13 = merged.IA.values.reshape(-1, 1)
X14 = merged.ID.values.reshape(-1, 1)
X15 = merged.IL.values.reshape(-1, 1)
X16 = merged.IN.values.reshape(-1, 1)
X17 = merged.KS.values.reshape(-1, 1)
X18 = merged.KY.values.reshape(-1, 1)
X19 = merged.LA.values.reshape(-1, 1)
X20 = merged.MA.values.reshape(-1, 1)
X21 = merged.MD.values.reshape(-1, 1)
X22 = merged.ME.values.reshape(-1, 1)
X23 = merged.MI.values.reshape(-1, 1)
X24 = merged.MN.values.reshape(-1, 1)
X25 = merged.MO.values.reshape(-1, 1)
X26 = merged.MS.values.reshape(-1, 1)
X27 = merged.MT.values.reshape(-1, 1)
X28 = merged.NC.values.reshape(-1, 1)
X29 = merged.ND.values.reshape(-1, 1)
X30 = merged.NE.values.reshape(-1, 1)
X31 = merged.NH.values.reshape(-1, 1)
X32 = merged.NJ.values.reshape(-1, 1)
X33 = merged.NM.values.reshape(-1, 1)
X34 = merged.NV.values.reshape(-1, 1)
X35 = merged.NY.values.reshape(-1, 1)
X36 = merged.OH.values.reshape(-1, 1)
X37 = merged.OK.values.reshape(-1, 1)
X38 = merged.OR.values.reshape(-1, 1)
X39 = merged.PA.values.reshape(-1, 1)
X40 = merged.RI.values.reshape(-1, 1)
X41 = merged.SC.values.reshape(-1, 1)
X42 = merged.SD.values.reshape(-1, 1)
X43 = merged.TN.values.reshape(-1, 1)
X44 = merged.TX.values.reshape(-1, 1)
X45 = merged.UT.values.reshape(-1, 1)
X46 = merged.VA.values.reshape(-1, 1)
X47 = merged.VT.values.reshape(-1, 1)
X48 = merged.WA.values.reshape(-1, 1)
X49 = merged.WI.values.reshape(-1, 1)
X50 = merged.WV.values.reshape(-1, 1)
X51 = merged.WY.values.reshape(-1, 1)
# Outcome variable: total funding in USD.
y = df2.funding_total_usd
def y_pred(x, y):
    """Fit an ordinary least-squares model of y on x and return the
    in-sample predictions for x."""
    fitted = LinearRegression().fit(x, y)
    return fitted.predict(x)
# 2x5 grid of simple-regression fits: each line is the predicted funding as a
# function of one state's 0/1 dummy (x=0: not in the state, x=1: in the state).
# CA (X5), MA (X20) and NY (X35) are drawn on the [1,3]/[1,4] panels because
# their slopes dwarf the others.
fig, ax = plt.subplots(2,5, figsize = (30,15), sharey = True)
ax[0,0].plot(X1, y_pred(X1, y), color = 'b')
ax[0,0].plot(X2, y_pred(X2,y), color = 'r')
ax[0,0].plot(X3, y_pred(X3,y), color = 'g')
ax[0,0].plot(X4, y_pred(X4,y), color = 'y')
ax[1,3].plot(X5, y_pred(X5,y), color = 'c')
ax[0,0].plot(X6, y_pred(X6,y), color = 'm')
ax[0,0].plot(X7, y_pred(X7,y), color = 'k')
ax[0,1].plot(X8, y_pred(X8,y), color = 'b')
ax[0,1].plot(X9, y_pred(X9,y), color = 'r')
ax[0,1].plot(X10, y_pred(X10,y), color = 'g')
ax[0,1].plot(X11, y_pred(X11,y), color = 'y')
ax[0,1].plot(X12, y_pred(X12,y), color = 'c')
ax[0,1].plot(X13, y_pred(X13,y), color = 'm')
ax[0,1].plot(X14, y_pred(X14,y), color = 'k')
ax[0,2].plot(X15, y_pred(X15,y), color = 'b')
ax[0,2].plot(X16, y_pred(X16,y), color = 'r')
ax[0,2].plot(X17, y_pred(X17,y), color = 'g') # b, r, g, y, c, m, k
ax[0,2].plot(X18, y_pred(X18,y), color = 'y')
ax[0,2].plot(X19, y_pred(X19,y), color = 'c')
ax[1,4].plot(X20, y_pred(X20,y), color = 'm')
ax[0,2].plot(X21, y_pred(X21,y), color = 'k')
ax[0,3].plot(X22, y_pred(X22,y), color = 'b')
ax[0,3].plot(X23, y_pred(X23,y), color = 'r')
ax[0,3].plot(X24, y_pred(X24,y), color = 'g')
ax[0,3].plot(X25, y_pred(X25,y), color = 'y')
ax[0,3].plot(X26, y_pred(X26,y), color = 'c')
ax[0,3].plot(X27, y_pred(X27,y), color = 'm')
ax[0,3].plot(X28, y_pred(X28,y), color = 'k')
ax[0,4].plot(X29, y_pred(X29,y), color = 'b')
ax[0,4].plot(X30, y_pred(X30,y), color = 'r')
ax[0,4].plot(X31, y_pred(X31,y), color = 'g')
ax[0,4].plot(X32, y_pred(X32,y), color = 'y')
ax[0,4].plot(X33, y_pred(X33,y), color = 'c')
ax[0,4].plot(X34, y_pred(X34,y), color = 'm')
ax[1,4].plot(X35, y_pred(X35,y), color = 'k')
ax[1,0].plot(X36, y_pred(X36,y), color = 'b')
ax[1,0].plot(X37, y_pred(X37,y), color = 'r')
# FIX: the original plotted X36 a second time here (in green), overdrawing the
# blue X36 line; the accidental duplicate call has been removed.
ax[1,0].plot(X38, y_pred(X38,y), color = 'y')
ax[1,0].plot(X39, y_pred(X39,y), color = 'c')
ax[1,0].plot(X40, y_pred(X40,y), color = 'm')
ax[1,0].plot(X41, y_pred(X41,y), color = 'k')
ax[1,1].plot(X42, y_pred(X42,y), color = 'b')
ax[1,1].plot(X43, y_pred(X43,y), color = 'r')
ax[1,1].plot(X44, y_pred(X44,y), color = 'g')
ax[1,1].plot(X45, y_pred(X45,y), color = 'y')
ax[1,1].plot(X46, y_pred(X46,y), color = 'c')
ax[1,1].plot(X47, y_pred(X47,y), color = 'm')
ax[1,1].plot(X48, y_pred(X48,y), color = 'k')
ax[1,2].plot(X49, y_pred(X49,y), color = 'b')
ax[1,2].plot(X50, y_pred(X50,y), color = 'r')
ax[1,2].plot(X51, y_pred(X51,y), color = 'g')
# Axis labels and titles for the grid.
ax[0,0].set(ylabel = 'Funding USD (10s of Millions)')
ax[1,0].set(xlabel = 'State Dummy Variable', ylabel = 'Funding USD (10s of Millions)' )
ax[1,1].set(xlabel = 'State Dummy Variable')
ax[1,2].set(xlabel = 'State Dummy Variable')
ax[1,3].set(xlabel = 'State Dummy Variable')
ax[1,4].set(xlabel = 'State Dummy Variable')
ax[0,2].set(title = 'Predicted Funding Across States')
# Label the most notable states directly on their panels.
ax[0,0].annotate('AL', (0.5, 9500000), color ='r')
ax[1,3].annotate('CA', (0.5, 8000000), color = 'g')
ax[1,4].annotate('MA', (0.5, 9500000), color = 'm')
ax[1,4].annotate('NY', (0.5, 5000000), color = 'k')
ax[0,2].annotate('MD', (0.5, 8000000), color = 'k')
ax[1,1].annotate('WA', (0.7, 8500000), color = 'k')
ax[1,1].annotate('UT', (0.8, 7000000), color = 'y')
ax[1,1].annotate('TX', (0.8, 5900000), color = 'g')
Text(0.8, 5900000, 'TX')
Here we ran a simple linear regression with the location as the independent variable and the total funding as the y variable. The x axis represents a dummy variable where 1 represents being in that state and 0 represents not being in that state. Our linear regression model predicts the funding that a startup is likely to receive based on which state it is in. For simplicity I plotted California, Massachusetts and New York on the last two graphs at the bottom. We can see some interesting results from this graph. Firstly, as confirmed by our previous methods, Californian and Massachusetts startups do have better funding; that is, being located in California or Massachusetts seems to have a positive effect on the amount of funding received as opposed to not being located in CA or MA.
The surprising result is that being located in New York or Texas actually has an adverse effect on the amount of funding received, so being located in New York or Texas is actually disadvantageous to startups based on this result. In the other graph we see some even more surprising results. Being located in Alabama is good for your startup! I'm not sure if this is entirely due to some error or whether this is a significant finding. We also noticed that Washington, Maryland, and Utah seem to be favorable locations for startups.
Other states have not been labeled, as the majority of them seem to have an adverse effect on startup investments. This may be due to something we had previously looked at: the majority of startups do not receive any funding, so the state might not be entirely accountable for the poor funding.
# Download the US state boundaries (Census cartographic boundary shapefile).
state_df = gpd.read_file("http://www2.census.gov/geo/tiger/GENZ2016/shp/cb_2016_us_state_5m.zip")
# Log-scale the startup counts so the color scale is not dominated by CA.
gb_loc['log_count']= np.log(gb_loc['count'])
# Join the per-state funding aggregates onto the state geometries by state code.
states = state_df.merge(gb_loc, left_on="STUSPS", right_on="state_code", how="inner")
# FIX: the original re-downloaded the same shapefile a second time here; the
# redundant network read has been removed.
fig, gax = plt.subplots(figsize=(15,10))
# White base layer of all states, then the data layer drawn on top below.
state_df.plot(ax=gax, edgecolor="black", color="white")
# Clip the view to the contiguous US (excludes AK/HI).
plt.xlim(-130, -60)
plt.ylim(22.5, 50)
gax.annotate('Log No of Startups',xy=(0.5,0.06),xycoords='figure fraction')
# Fill each state by its log startup count.
states.plot(
ax=gax, edgecolor='black', column='log_count', legend=True, cmap='RdYlGn',
vmin=0, vmax=10 #range of the column value for the color legend
)
<AxesSubplot:>
Here we can see the number of startups per state on the map of the US. Note that Hawaii and Alaska are not shown here because the size of the map did not allow them to be included without disrupting the other states.
from bokeh.io import output_notebook
from bokeh.plotting import figure, ColumnDataSource
from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import figure
from bokeh.models import GeoJSONDataSource, LinearColorMapper, ColorBar, HoverTool
from bokeh.palettes import brewer
import bokeh.plotting
output_notebook()
import json
from bokeh.models import Range1d
#Convert data to geojson for bokeh
states_geojson=GeoJSONDataSource(geojson=states.to_json())
# NOTE(review): low (10,000,000) > high (0) inverts the palette so darker
# shades presumably mark higher funding -- confirm the inversion is intended.
color_mapper = LinearColorMapper(palette = brewer['PuBuGn'][9], low = 10000000, high = 0)
color_bar = ColorBar(title = 'Highest Funding to Lowest Funding(USD)', color_mapper=color_mapper, label_standoff=10,width = 500
, height = 50,
border_line_color=None,location = (0,0), orientation = 'horizontal')
# Hover tooltips expose the per-state aggregates computed earlier (gb_loc).
hover = HoverTool(tooltips = [('State','@NAME'),('Median','@median'),('Mean', '@mean'),
('Count','@count')])
p = figure(title="Mean total Startup Funding by Location", tools=[hover], height =750, width = 1000)
# Fill each state polygon according to its mean funding.
p.patches("xs","ys",source=states_geojson,
fill_color = {'field' :'mean', 'transform' : color_mapper})
p.title.text_font_size = '15pt'
p.add_layout(color_bar, 'below')
# Clip the view to the contiguous US.
p.x_range= Range1d(-130, -63)
p.y_range= Range1d(23,50)
show(p)
Here we can see the mean funding for the different states. Consistent with our results from the regression, we can see Alabama's name come up once again. This map gives us a good picture of the average startup funding in each state.
Once again we have performed some more tests to check the relationship between startup funding and location. From our evidence so far we can suggest that location has some effect on the startup funding although we cannot be entirely sure due to hidden variables. For the next project we will dive deeper and maybe try a multiple regression along with some other techniques for visualizing our results
As I want to see what effect the location has on the success of a startup, I also want to see if the findings I've had so far are consistent with other sources and research. In this section we look at whether the information from our previous experiments is accurate compared to an independent source. I will look at a CNBC article which ranks all the states in terms of an overall score for starting a business in the United States. The overall rank is determined by 85 metrics in 10 broad categories such as infrastructure, economy, etc. The rankings are from 2021, and the dataset also includes a ranking for how easy or hard it is to secure funding in each of the states. I believe this is important information that is missing from our original data and can help us learn more about our dataset — particularly why some states have an enormous number of startups compared to other states. I will scrape this data from the website below and store it in a dataframe. The data from the website can then be merged with the original startup dataset to see whether startups in general seem to be following the advice from the article. I am going to measure this by looking at the number of startups in each state (from our original dataset) and then comparing that to the access to capital in that state and its overall ranking (from the article). I am not using an API for the scraping, so we will only need to scrape this data once.
The data source that I am using is easy to scrape as it is a website containing a table similar to what we've worked with in class.
import requests #first we import the requests module
url = 'https://www.cnbc.com/2021/07/13/americas-top-states-for-business.html' #this is the URL that we will scrape the data from
response = requests.get(url) #request the page from the server
response.headers #inspect the response headers returned by the server
{'Content-Type': 'text/html; charset=utf-8', 'Set-Cookie': 'region=USA; expires=Wed, 22-Jun-2022 20:21:14 GMT; path=/; domain=.cnbc.com, akaas_CNBC_Audience_Segmentation=1650745274~rv=36~id=623e9c71b9145dd72aee714cadd27228; path=/; Expires=Sat, 23 Apr 2022 20:21:14 GMT; Domain=.www.cnbc.com; Secure; SameSite=None', 'X-Request-Id': 'f64bb176-baf9-4371-9603-240ba4f12c49', 'Vary': 'User-Agent, Accept-Encoding', 'Content-Security-Policy': "frame-ancestors 'self' *.cnbc.com *.acorns.com;", 'X-Aicache-OS': 'xx.xx.225.187:80, xxx.xx.14.238:81', 'Access-Control-Allow-Origin': '*', 'Content-Encoding': 'gzip', 'Expires': 'Thu, 24 Mar 2022 20:21:14 GMT', 'Cache-Control': 'max-age=0, no-cache', 'Pragma': 'no-cache', 'Date': 'Thu, 24 Mar 2022 20:21:14 GMT', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive, Transfer-Encoding'}
response.status_code #check the status code of our request using the .status_code attribute
#A status code of 200-299 indicates a successful response. Any other code usually means an error.
#The error code can be googled to get more information about what went wrong.
200
response.content[:2000] #preview the first 2000 bytes of the raw HTML we received
#this looks very unstructured and seems hard to understand
#we need to transform this into something more readable with an HTML parser.
b'<!DOCTYPE html><html lang="en" prefix="og=https://ogp.me/ns#" itemscope="" itemType="https://schema.org/WebPage"><head><script src="/assets/nbcu-common.js?single"></script><link rel="preload" as="font" href="https://static-redesign.cnbcfm.com/dist/icomoon.ttf" type="font/ttf" crossorigin="anonymous"/><link rel="preload" as="font" href="https://static-redesign.cnbcfm.com/dist/351C86_0_0.woff2" type="font/woff2" crossorigin="anonymous"/><link rel="preload" as="font" href="https://static-redesign.cnbcfm.com/dist/351C86_1_0.woff2" type="font/woff2" crossorigin="anonymous"/><link rel="preload" as="font" href="https://static-redesign.cnbcfm.com/dist/351C86_2_0.woff2" type="font/woff2" crossorigin="anonymous"/><link rel="preload" as="font" href="https://static-redesign.cnbcfm.com/dist/351C86_3_0.woff2" type="font/woff2" crossorigin="anonymous"/><link rel="preload" as="font" href="https://static-redesign.cnbcfm.com/dist/351C86_4_0.woff2" type="font/woff2" crossorigin="anonymous"/><link rel="preload" as="font" href="https://static-redesign.cnbcfm.com/dist/LyonText-Bold-Web.woff2" type="font/woff2" crossorigin="anonymous"/><link rel="preload" as="font" href="https://static-redesign.cnbcfm.com/dist/LyonText-Regular-Web.woff2" type="font/woff2" crossorigin="anonymous"/><meta name="format-detection" content="telephone=no"/><style type="text/css">@charset "UTF-8";.RecaptchaAcknowledgement-acknowledgement{-webkit-box-flex:1;color:#747474;-ms-flex:1;flex:1;font-size:11px;font-weight:600;line-height:15px;margin-bottom:7px;margin-top:24px;min-width:295px}.RecaptchaAcknowledgement-acknowledgement a{color:#747474;font-weight:500;text-decoration:none}.RecaptchaAcknowledgement-acknowledgement a:hover{color:#747474;text-decoration:underline}.RecaptchaAcknowledgement-acknowledgement a:active{color:#747474}.RecaptchaAcknowledgement-leftAligned{text-align:left}.RecaptchaAcknowledgement-rightAligned{text-align:right}.AuthForms-container{margin:0 auto;padding:0 
59px;text-align:center;width:458'
from bs4 import BeautifulSoup #we will use the Beautiful Soup library to parse the raw HTML from the website
#into a navigable document tree.
soup = BeautifulSoup(response.content) #parse the HTML content into a BeautifulSoup object
#so we can search it by tag and CSS class
#From the parsed document we need to find the rankings table.
data_table = soup.find_all('table', 'BasicTable-table')[0]
# The table's tag and CSS class were found by opening the URL in a browser,
# right-clicking the table and choosing Inspect to read the HTML.
# The rankings table uses the "BasicTable-table" class; we take the first match.
values = data_table.find_all('tr') #each <tr> tag is one row of the table
#(the first row is the header; data rows follow).
values[0]#display the header row to confirm the rows were extracted
<tr><th class="BasicTable-numData"><span>OVERALL<!-- --> <span class="icon-sort undefined"></span></span></th><th class="BasicTable-textData"><span>State<!-- --> <span class="icon-sort undefined"></span></span></th><th class="BasicTable-numData"><span>Cost of Doing Business<!-- --> <span class="icon-sort undefined"></span></span></th><th class="BasicTable-numData"><span>Infrastructure<!-- --> <span class="icon-sort undefined"></span></span></th><th class="BasicTable-numData"><span>Life, Health & Inclusion<!-- --> <span class="icon-sort undefined"></span></span></th><th class="BasicTable-numData"><span>Workforce<!-- --> <span class="icon-sort undefined"></span></span></th><th class="BasicTable-numData"><span>Economy<!-- --> <span class="icon-sort undefined"></span></span></th><th class="BasicTable-numData"><span>Business Friendliness<!-- --> <span class="icon-sort undefined"></span></span></th><th class="BasicTable-numData"><span>Access to Capital<!-- --> <span class="icon-sort undefined"></span></span></th><th class="BasicTable-numData"><span> Technology & Innovation<!-- --> <span class="icon-sort undefined"></span></span></th><th class="BasicTable-numData"><span>Education<!-- --> <span class="icon-sort undefined"></span></span></th><th class="BasicTable-numData"><span>Cost of Living<!-- --> <span class="icon-sort undefined"></span></span></th></tr>
#We've already extracted all the values that we need for this project.
#Let's put them into a data frame.
top_states = pd.DataFrame(columns = ['Rank', 'State', 'access_to_capital']) # Create an empty dataframe
ix = 0 # Initialise index to zero
for row in values[1:]:  # skip the header row
    # FIX: the original reassigned `values` here, shadowing the list being
    # iterated (it only worked because the slice above made a copy); use a
    # distinct name for the row's cells.
    cells = row.find_all('td')  # each <td> tag is one cell of the table row
    rank = cells[0].text.strip()  # first cell: overall rank
    state = cells[1].text.strip()  # second cell: state name
    capital_access = cells[8].text.strip()  # ninth cell (index 8): access to capital
    top_states.loc[ix] = [rank, state, capital_access]  # append as a new row
    ix += 1
# Print the first 10 rows of the dataframe
top_states.head(10)
| Rank | State | access_to_capital | |
|---|---|---|---|
| 0 | 1 | Virginia | 9 |
| 1 | 2 | North Carolina | 8 |
| 2 | 3 | Utah | 17 |
| 3 | 4 | Texas | 3 |
| 4 | 5 | Tennessee | 14 |
| 5 | 6 | Georgia | 10 |
| 6 | 7 | Minnesota | 14 |
| 7 | 8 | Colorado | 16 |
| 8 | 9 | Washington | 11 |
| 9 | 10 | Ohio | 7 |
Now we have all the data that we need for our experiment in a dataframe.
# Two-letter state codes listed in the same order as the CNBC overall ranking
# scraped above (rank 1 = VA ... rank 50 = AK); used to join with our data.
best_codes = ['VA', 'NC', 'UT', 'TX', 'TN', 'GA', 'MN', 'CO', 'WA', 'OH', 'MI', 'MD', 'NE', 'MA', 'IL', 'ID', 'FL', 'ND', 'IN', 'IA', 'WI', 'NY', 'PA',
'CT', 'MO', 'NJ', 'DE', 'KS', 'SD', 'AZ', 'AL', 'OK', 'CA', 'MT', 'OR', 'WY', 'NH', 'NM', 'SC', 'NV', 'KY', 'VT', 'AR', 'LA',
'MS', 'RI', 'WV', 'ME', 'HI', 'AK']
Our dataframe contains only state names, not state codes. If we are to merge it with the datasets we've been working with so far, we need state codes. Therefore, I will create a list of state codes in the order of their overall ranking according to the CNBC website.
top_states['code']= best_codes #add a state-code column, aligned row-by-row with the ranking order
First, I am going to create a scatterplot just to look at the data that we've put into our dataframe to draw some observation about this new data. I am mainly interested in figuring out if there is some relationship between overall rank and the access to capital across the states.
# Interactive scatter of each state's overall business rank against its
# access-to-capital rank, with hover tooltips showing the state name.
source = ColumnDataSource(top_states)
hover = HoverTool(tooltips = [('State','@State'),('Rank', '@Rank'), ('Capital Access', '@access_to_capital') ])
# FIX: removed an unused LinearColorMapper (it was never passed to the glyph)
# and replaced the deprecated plot_width/plot_height keywords with
# width/height, consistent with the other figures in this file.
e = figure(width = 500, height = 500, title = "Rank vs Access to Capital Across States",toolbar_location=None,
tools=[hover])
e.circle(x = 'access_to_capital', y = 'Rank', size=10, source=top_states)
e.xaxis.axis_label = 'Access to Capital'
e.yaxis.axis_label = 'Overall Rank'
# Pad the axes slightly beyond the 1-50 rank range.
e.x_range= Range1d(0, 55)
e.y_range= Range1d(0,55)
show(e)
As we can see here, there is some relationship between access to capital and the overall rank; however, from our scatter plot above it seems that the relationship is not particularly strong. We can see that states such as Nebraska and Idaho rank better overall but don't seem to do so well when it comes to access to capital. Alternatively, businesses in California seem to have greater access to funding but do not do so well overall.
Now that we have analyzed our extracted data, it is time to merge it with our original data. I will merge our extracted data with the gb_loc dataframe that we created earlier.
# Join the per-state funding aggregates with the scraped CNBC rankings on the state code.
best_states = gb_loc.merge(top_states, left_on = 'state_code', right_on = 'code', how= 'inner')
We're going to create an interactive map similar to what we had created earlier however this time the fill of our map will be represented by the rankings from the CNBC ariticle instead of the mean funding for each state. For this we will have to merge our best_states dataframe with the state_df dataframe that we downloaded earlier.
# Join the combined funding + ranking table onto the state geometries.
states_for_business = state_df.merge(best_states, left_on="STUSPS", right_on="code", how="inner")
best_states_geojson=GeoJSONDataSource(geojson=states_for_business.to_json())
# Low rank number = better state, so the color scale runs 0 (best) to 50 (worst).
color_mapper = LinearColorMapper(palette = brewer['PuBuGn'][9], low = 0, high = 50, name = 'Best to Worst')
color_bar = ColorBar(title = 'Best Overall to Worst Overall', color_mapper=color_mapper, label_standoff=10,width = 500
, height = 50,
border_line_color=None,location = (0,0), orientation = 'horizontal')
# Tooltips combine our funding aggregates with the scraped CNBC rankings.
hover = HoverTool(tooltips = [('State','@NAME'),('Median','@median'),('Mean', '@mean'),
('Count','@count'), ('Overall_Rank', '@Rank'), ('Access to Capital', '@access_to_capital')])
p = figure(title="Best States to Start a Business", tools=[hover], height =750, width = 1000)
# Fill each state polygon according to its CNBC overall rank.
p.patches("xs","ys",source=best_states_geojson,
fill_color = {'field' :'Rank', 'transform' : color_mapper})
p.title.text_font_size = '15pt'
p.add_layout(color_bar, 'below')
# Clip the view to the contiguous US.
p.x_range= Range1d(-130, -63)
p.y_range= Range1d(23,50)
show(p)
We can see some inconsistency between the mean funding across states, which we found previously, and the overall ranking for best states to start a business. For example, California, which has the largest number of startups in the United States, has a lower overall ranking. Texas is a state that scores high in the overall ranking and also had a relatively high average funding. Another factor that may cause an underlying difference between the ranking and the mean funding could be that the rankings are from 2021, whereas some of the startups in our dataset have been in business for far longer. It is reasonable to assume that the economic conditions in each of the states have changed over the years in order to create a suitable environment for startups, or vice versa. Our findings so far have suggested that being located in certain states is beneficial to your business. Specifically, it seems that Washington and Texas have been better locations for a startup to succeed in recent years.
Link to view interactive maps and plots: https://utoronto-my.sharepoint.com/:u:/g/personal/momair_khan_mail_utoronto_ca/ETNwqu4YAsdFvxsvpI19tWYBr74hEu6c-LoGSjjs4RkwSQ?e=PJwVSw